library("ggplot2")
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
## tripduration starttime stoptime
## 1 564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2 1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3 763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4 915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5 1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6 267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
## start.station.id start.station.name start.station.latitude
## 1 3711 E 13 St & Avenue A 40.72967
## 2 3016 Kent Ave & N 7 St 40.72037
## 3 382 University Pl & E 14 St 40.73493
## 4 359 E 47 St & Park Ave 40.75510
## 5 3295 Central Park W & W 96 St 40.79127
## 6 3377 Carroll St & Bond St 40.67861
## start.station.longitude end.station.id end.station.name end.station.latitude
## 1 -73.98068 168 W 18 St & 6 Ave 40.73971
## 2 -73.96165 3016 Kent Ave & N 7 St 40.72037
## 3 -73.99201 459 W 20 St & 11 Ave 40.74674
## 4 -73.97499 483 E 12 St & 3 Ave 40.73223
## 5 -73.96484 3142 1 Ave & E 62 St 40.76123
## 6 -73.99037 3398 Smith St & 9 St 40.67470
## end.station.longitude bikeid usertype birth.year gender
## 1 -73.99456 29807 Subscriber 1994 1
## 2 -73.96165 34411 Subscriber 1974 1
## 3 -74.00776 16078 Subscriber 1961 1
## 4 -73.98890 29904 Subscriber 1964 2
## 5 -73.96094 30247 Customer 1969 0
## 6 -73.99786 20315 Subscriber 1971 1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
## STATION NAME DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019 NA 0.06 0 0 NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019 NA 0.00 0 0 NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019 NA 0.00 0 0 NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019 NA 0.00 0 0 NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019 NA 0.50 0 0 NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019 NA 0.00 0 0 NA
## TMAX TMIN
## 1 58 39
## 2 40 35
## 3 44 37
## 4 47 35
## 5 47 41
## 6 49 31
# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame': 100000 obs. of 15 variables:
## $ tripduration : int 564 1158 763 915 1368 267 661 1112 520 512 ...
## $ starttime : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
## $ stoptime : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
## $ start.station.id : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
## $ start.station.name : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
## $ start.station.latitude : num 40.7 40.7 40.7 40.8 40.8 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
## $ end.station.name : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
## $ end.station.latitude : num 40.7 40.7 40.7 40.7 40.8 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : int 29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
## $ birth.year : int 1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
## $ gender : int 1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
## tripduration starttime
## Min. : 61.0 2019-11-22 17:59:58.4760: 2
## 1st Qu.: 362.0 2019-01-01 00:56:30.7720: 1
## Median : 614.0 2019-01-01 01:35:30.5010: 1
## Mean : 950.8 2019-01-01 02:04:41.7180: 1
## 3rd Qu.: 1075.0 2019-01-01 02:25:28.9700: 1
## Max. :2769536.0 2019-01-01 02:33:50.6550: 1
## (Other) :99993
## stoptime start.station.id
## 2019-01-01 01:34:45.0200: 1 519 : 810
## 2019-01-01 01:51:55.8730: 1 3255 : 617
## 2019-01-01 02:13:13.4810: 1 497 : 602
## 2019-01-01 02:29:13.1090: 1 402 : 561
## 2019-01-01 03:04:23.8640: 1 435 : 551
## 2019-01-01 04:09:48.6020: 1 (Other):96523
## (Other) :99994 NA's : 336
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 810 Min. :40.66 Min. :-74.03
## 8 Ave & W 31 St : 617 1st Qu.:40.72 1st Qu.:-74.00
## E 17 St & Broadway : 602 Median :40.74 Median :-73.98
## Broadway & E 22 St : 561 Mean :40.74 Mean :-73.98
## W 21 St & 6 Ave : 551 3rd Qu.:40.76 3rd Qu.:-73.97
## Broadway & E 14 St : 548 Max. :40.85 Max. :-73.88
## (Other) :96311
## end.station.id end.station.name end.station.latitude
## 519 : 792 Pershing Square North: 792 Min. :40.66
## 402 : 636 Broadway & E 22 St : 636 1st Qu.:40.72
## 3255 : 632 8 Ave & W 31 St : 632 Median :40.74
## 497 : 623 E 17 St & Broadway : 623 Mean :40.74
## 285 : 547 Broadway & E 14 St : 547 3rd Qu.:40.76
## (Other):96426 W 21 St & 6 Ave : 544 Max. :40.86
## NA's : 344 (Other) :96226
## end.station.longitude bikeid usertype birth.year
## Min. :-74.03 Min. :14529 Customer :14054 Min. :1885
## 1st Qu.:-74.00 1st Qu.:25346 Subscriber:85946 1st Qu.:1970
## Median :-73.99 Median :30918 Median :1983
## Mean :-73.98 Mean :29674 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35049 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender
## Min. :0.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.161
## 3rd Qu.:1.000
## Max. :2.000
##
# Initial summart of weather data set
str(weather_data)
## 'data.frame': 365 obs. of 10 variables:
## $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
## $ NAME : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
## $ DATE : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
## $ AWND : num NA NA NA NA NA NA NA NA NA NA ...
## $ PRCP : num 0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TMAX : int 58 40 44 47 47 49 34 45 45 34 ...
## $ TMIN : int 39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
## tripduration starttime
## Min. : 61.0 2019-11-22 17:59:58.4760: 2
## 1st Qu.: 362.0 2019-01-01 00:56:30.7720: 1
## Median : 614.0 2019-01-01 01:35:30.5010: 1
## Mean : 950.8 2019-01-01 02:04:41.7180: 1
## 3rd Qu.: 1075.0 2019-01-01 02:25:28.9700: 1
## Max. :2769536.0 2019-01-01 02:33:50.6550: 1
## (Other) :99993
## stoptime start.station.id
## 2019-01-01 01:34:45.0200: 1 519 : 810
## 2019-01-01 01:51:55.8730: 1 3255 : 617
## 2019-01-01 02:13:13.4810: 1 497 : 602
## 2019-01-01 02:29:13.1090: 1 402 : 561
## 2019-01-01 03:04:23.8640: 1 435 : 551
## 2019-01-01 04:09:48.6020: 1 (Other):96523
## (Other) :99994 NA's : 336
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 810 Min. :40.66 Min. :-74.03
## 8 Ave & W 31 St : 617 1st Qu.:40.72 1st Qu.:-74.00
## E 17 St & Broadway : 602 Median :40.74 Median :-73.98
## Broadway & E 22 St : 561 Mean :40.74 Mean :-73.98
## W 21 St & 6 Ave : 551 3rd Qu.:40.76 3rd Qu.:-73.97
## Broadway & E 14 St : 548 Max. :40.85 Max. :-73.88
## (Other) :96311
## end.station.id end.station.name end.station.latitude
## 519 : 792 Pershing Square North: 792 Min. :40.66
## 402 : 636 Broadway & E 22 St : 636 1st Qu.:40.72
## 3255 : 632 8 Ave & W 31 St : 632 Median :40.74
## 497 : 623 E 17 St & Broadway : 623 Mean :40.74
## 285 : 547 Broadway & E 14 St : 547 3rd Qu.:40.76
## (Other):96426 W 21 St & 6 Ave : 544 Max. :40.86
## NA's : 344 (Other) :96226
## end.station.longitude bikeid usertype birth.year
## Min. :-74.03 Min. :14529 Customer :14054 Min. :1885
## 1st Qu.:-74.00 1st Qu.:25346 Subscriber:85946 1st Qu.:1970
## Median :-73.99 Median :30918 Median :1983
## Mean :-73.98 Mean :29674 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35049 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender
## Min. :0.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.161
## 3rd Qu.:1.000
## Max. :2.000
##
# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")
rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)
# Combining data frames to compare data
edited_weather <- select(weather_data,
PRCP,
SNOW,
AWND,
DATE)
edited_rider <- select(rider_2019_sample,
age,
gender,
usertype,
tripduration,
start.station.latitude,
start.station.longitude,
start.station.id,
start.station.name,
end.station.latitude,
end.station.longitude,
end.station.id,
end.station.name,
DATE,
Day,
Month,
Year)
total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
## DATE PRCP SNOW AWND age gender usertype tripduration
## 1 2019-01-01 0.06 0 NA 52 1 Subscriber 1166
## 2 2019-01-01 0.06 0 NA 33 1 Subscriber 532
## 3 2019-01-01 0.06 0 NA 55 1 Subscriber 263
## 4 2019-01-01 0.06 0 NA 29 1 Subscriber 196
## 5 2019-01-01 0.06 0 NA 28 1 Subscriber 710
## 6 2019-01-01 0.06 0 NA 37 2 Subscriber 312
## start.station.latitude start.station.longitude start.station.id
## 1 40.72037 -73.96165 3016
## 2 40.67583 -73.95617 3569
## 3 40.74517 -73.98683 474
## 4 40.72308 -73.98584 3656
## 5 40.75187 -73.97771 519
## 6 40.71422 -73.98135 502
## start.station.name end.station.latitude end.station.longitude
## 1 Kent Ave & N 7 St 40.72080 -73.95485
## 2 Franklin Ave & St Marks Ave 40.69073 -73.95133
## 3 5 Ave & E 29 St 40.74034 -73.98955
## 4 E 2 St & Avenue A 40.72087 -73.98086
## 5 Pershing Square North 40.73222 -73.98166
## 6 Henry St & Grand St 40.72217 -73.98369
## end.station.id end.station.name Day Month Year
## 1 3101 N 12 St & Bedford Ave 01 01 2019
## 2 3056 Kosciuszko St & Nostrand Ave 01 01 2019
## 3 402 Broadway & E 22 St 01 01 2019
## 4 150 E 2 St & Avenue C 01 01 2019
## 5 504 1 Ave & E 16 St 01 01 2019
## 6 301 E 2 St & Avenue B 01 01 2019
# Distance between start and end station in Meters
total_data <- mutate(total_data,
distance = distHaversine(cbind(total_data$start.station.longitude,
total_data$start.station.latitude),
cbind(total_data$end.station.longitude,
total_data$end.station.latitude)))
total_data <- filter(total_data, tripduration <= 3600)
head(total_data)
## DATE PRCP SNOW AWND age gender usertype tripduration
## 1 2019-01-01 0.06 0 NA 52 1 Subscriber 1166
## 2 2019-01-01 0.06 0 NA 33 1 Subscriber 532
## 3 2019-01-01 0.06 0 NA 55 1 Subscriber 263
## 4 2019-01-01 0.06 0 NA 29 1 Subscriber 196
## 5 2019-01-01 0.06 0 NA 28 1 Subscriber 710
## 6 2019-01-01 0.06 0 NA 37 2 Subscriber 312
## start.station.latitude start.station.longitude start.station.id
## 1 40.72037 -73.96165 3016
## 2 40.67583 -73.95617 3569
## 3 40.74517 -73.98683 474
## 4 40.72308 -73.98584 3656
## 5 40.75187 -73.97771 519
## 6 40.71422 -73.98135 502
## start.station.name end.station.latitude end.station.longitude
## 1 Kent Ave & N 7 St 40.72080 -73.95485
## 2 Franklin Ave & St Marks Ave 40.69073 -73.95133
## 3 5 Ave & E 29 St 40.74034 -73.98955
## 4 E 2 St & Avenue A 40.72087 -73.98086
## 5 Pershing Square North 40.73222 -73.98166
## 6 Henry St & Grand St 40.72217 -73.98369
## end.station.id end.station.name Day Month Year distance
## 1 3101 N 12 St & Bedford Ave 01 01 2019 576.0106
## 2 3056 Kosciuszko St & Nostrand Ave 01 01 2019 1707.3540
## 3 402 Broadway & E 22 St 01 01 2019 584.0158
## 4 150 E 2 St & Avenue C 01 01 2019 486.4067
## 5 504 1 Ave & E 16 St 01 01 2019 2213.1388
## 6 301 E 2 St & Avenue B 01 01 2019 907.8033
# Reclassifying the genders
# 0=unknown, 1=male, 2=female
total_data$gender <- ifelse(total_data$gender == 0, "Unknown",
ifelse(total_data$gender == 1, "Male", "Female"))
# Seeing the split of genders who rented bikes
total_data %>%
ggplot(aes(x=gender, fill=gender)) +
geom_bar() + theme(legend.position="none") +
ggtitle("Bike Rental Counts by Gender")
# Seeing the split of user type who rented bikes
total_data %>%
ggplot(aes(x=usertype, fill=usertype)) +
geom_bar() +
theme(legend.position="none") +
ggtitle("Bike Rental Counts by User Type")
# Range of all bike rides
trip_duration_stats <- filter(total_data) %>%
# min range of tripduration
summarise(duration_range_min = min(tripduration, na.rm=TRUE),
# max range of tripduration
duration_range_max = max(tripduration, na.rm=TRUE),
# average length of bike ride
duration_mean = mean(tripduration, na.rm=TRUE),
# standard deviation of bike ride
duration_sd = sd(tripduration, na.rm=TRUE))
trip_duration_stats
## duration_range_min duration_range_max duration_mean duration_sd
## 1 61 3599 789.341 587.415
# Range of all bike rides affected by rain
total_data_rain <- filter(total_data, SNOW == 0, PRCP > 0) %>%
# min range of tripduration
summarise(duration_range_min = min(tripduration, na.rm=TRUE),
# max range of tripduration
duration_range_max = max(tripduration, na.rm=TRUE),
# average length of bike ride affected by rain
duration_mean = mean(tripduration, na.rm=TRUE),
# standard deviation of bike ride affected by rain
duration_sd = sd(tripduration, na.rm=TRUE))
total_data_rain
## duration_range_min duration_range_max duration_mean duration_sd
## 1 61 3598 777.5114 575.1325
# Range of all bike rides affected by snow
total_data_snow <- filter(total_data, SNOW > 0) %>%
# min range of tripduration
summarise(duration_range_min = min(tripduration, na.rm=TRUE),
# max range of tripduration
duration_range_max = max(tripduration, na.rm=TRUE),
# average length of bike ride affected by snow
duration_mean = mean(tripduration, na.rm=TRUE),
# standard deviation of bike ride affected by snow
duration_sd = sd(tripduration, na.rm=TRUE))
total_data_snow
## duration_range_min duration_range_max duration_mean duration_sd
## 1 62 3548 660.3067 525.4768
# Range of all bike rides affected by wind
total_data_wind <- filter(total_data, SNOW == 0, PRCP == 0, AWND > 0) %>%
# min range of tripduration
summarise(duration_range_min = min(tripduration, na.rm=TRUE),
# max range of tripduration
duration_range_max = max(tripduration, na.rm=TRUE),
# average length of bike ride affected by snow
duration_mean = mean(tripduration, na.rm=TRUE),
# standard deviation of bike ride affected by snow
duration_sd = sd(tripduration, na.rm=TRUE))
total_data_wind
## duration_range_min duration_range_max duration_mean duration_sd
## 1 61 3599 816.5905 601.8395
# Combine above dataframes into one dataframe for side-by-side comparison
dataframe_list = list("Total Data" = trip_duration_stats,
"Rain Data" = total_data_rain,
"Snow Data" = total_data_snow,
"Wind Data" = total_data_wind)
# Can also do rbind(trip_duration_stats, total_data_rain, etc) but this keeps source table names defined above
do.call(rbind, dataframe_list)
## duration_range_min duration_range_max duration_mean duration_sd
## Total Data 61 3599 789.3410 587.4150
## Rain Data 61 3598 777.5114 575.1325
## Snow Data 62 3548 660.3067 525.4768
## Wind Data 61 3599 816.5905 601.8395
# Average rain per month
total_data %>%
filter(SNOW == 0) %>%
group_by(Month) %>%
summarise(average_rain = mean(PRCP, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## Month average_rain
## <chr> <dbl>
## 1 01 0.0790
## 2 02 0.0668
## 3 03 0.0631
## 4 04 0.122
## 5 05 0.131
## 6 06 0.149
## 7 07 0.158
## 8 08 0.100
## 9 09 0.0217
## 10 10 0.123
## 11 11 0.0386
## 12 12 0.170
# Average snow per month
total_data %>%
group_by(Month) %>%
summarise(average_snow = mean(SNOW, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## Month average_snow
## <chr> <dbl>
## 1 01 0.0303
## 2 02 0.0567
## 3 03 0.189
## 4 04 0
## 5 05 0
## 6 06 0
## 7 07 0
## 8 08 0
## 9 09 0
## 10 10 0
## 11 11 0
## 12 12 0.0675
# Average wind speed per month
total_data %>%
group_by(Month) %>%
summarise(average_wind_speed = mean(AWND, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## Month average_wind_speed
## <chr> <dbl>
## 1 01 NaN
## 2 02 NaN
## 3 03 4.92
## 4 04 4.35
## 5 05 3.73
## 6 06 4.11
## 7 07 3.41
## 8 08 3.85
## 9 09 4.29
## 10 10 5.25
## 11 11 5.30
## 12 12 6.34
# mean returns NaN if all values in group (ex: jan and feb) are NA
# Trip duration by age of riders and rain amount
plot_data <- total_data %>%
filter(SNOW == 0) %>%
group_by(age) %>%
summarise(mean_PRCP_by_age = mean(PRCP),
mean_duration = mean(tripduration))
## `summarise()` ungrouping output (override with `.groups` argument)
plot_data %>%
ggplot(aes(x = age, y = mean_PRCP_by_age)) +
geom_point(alpha =0.9, shape = 18, colour = "blue", size = plot_data$mean_duration/150) +
geom_smooth(colour = "orange")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Mean Wind by Age of Rider
total_data %>%
group_by(age) %>%
summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth()
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Average ride time when it's raining
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
summarise(prcp_duration_mean = mean(tripduration))
## prcp_duration_mean
## 1 777.5114
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Average ride time when it's snowing
total_data %>%
filter(SNOW > 0) %>%
summarise(snow_duration_mean = mean(tripduration))
## snow_duration_mean
## 1 660.3067
total_data %>%
filter(SNOW > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Average ride time when it's windy
total_data %>%
filter(AWND > 0) %>%
summarise(wind_duration_mean = mean(tripduration))
## wind_duration_mean
## 1 803.9685
total_data %>%
filter(AWND > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of rides over average time sin weather effects
ride_num <- total_data %>%
filter(tripduration > trip_duration_stats[1,3]) %>%
count()
ride_num[1,1]
## [1] 37200
# Number of rides over average time with rain
rain_num <- total_data %>%
filter(SNOW == 0, PRCP > 0, tripduration > trip_duration_stats[1,3]) %>%
count()
rain_num[1,1]
## [1] 12452
# Number of rides over average time with snow
snow_num <- total_data %>%
filter(SNOW > 0, tripduration > trip_duration_stats[1,3]) %>%
count()
snow_num[1,1]
## [1] 503
# Number of rides over average time with wind
wind_num <- total_data %>%
filter(AWND > 0, tripduration > trip_duration_stats[1,3]) %>%
count()
wind_num[1,1]
## [1] 34200
# Speed of the rider
total_data$speed <- total_data$distance/total_data$tripduration
# Average speed of all riders
all_ride <- total_data %>%
summarise(average_speed = mean(speed))
# Average speed of young riders
young_ride <- total_data %>%
filter(age <= 45) %>%
summarise(young_average = mean(speed))
# Average speed of old riders
old_ride <- total_data %>%
filter(age >= 65) %>%
summarise(old_average = mean(speed))
# Average speed of female riders
fem_ride <- total_data %>%
filter(gender == "Female") %>%
summarise(female_average = mean(speed))
# Average speed of male riders
male_ride <- total_data %>%
filter(gender == "Male") %>%
summarise(male_average = mean(speed))
# Average speed of subscribers
sub_ride <- total_data %>%
filter(usertype == "Customer") %>%
summarise(customer_average = mean(speed))
# Average speed of customers
cust_ride <- total_data %>%
filter(usertype == "Subscriber") %>%
summarise(subscriber_average = mean(speed))
cbind(all_ride, young_ride, old_ride, fem_ride, male_ride, sub_ride, cust_ride)
## average_speed young_average old_average female_average male_average
## 1 2.462556 2.538826 2.19201 2.326377 2.572124
## customer_average subscriber_average
## 1 1.801693 2.565832
# Scatter Plot of speed by age
total_data %>%
ggplot(aes(x = age, y = speed, colour = gender)) +
geom_point(alpha = .4, size = 1.5) +
scale_colour_manual(name = 'Gender',
values = setNames(c('blue','magenta', 'dark green'),
c('Male', 'Female', 'Unknown'))) +
geom_smooth(method='lm', colour = 'black') +
facet_wrap(~gender) + # make a plot per gender. look up facet_wrap for other fun ways to do this
labs(title="Average Speed of Riders by Age", x="Speed", y="Age")
## `geom_smooth()` using formula 'y ~ x'
# Boxplot of speed by gender
total_data %>%
ggplot(aes(x = gender, y = speed, colour = gender)) +
geom_boxplot(outlier.colour = 'red') +
scale_colour_manual(name = 'Gender',
values = setNames(c('blue','magenta', 'dark green'),
c('Male', 'Female', 'Unknown'))) +
labs(title="Speed of Riders by Gender", x="Gender", y="Speed")
# Boxplot of speed by customer type
total_data %>%
ggplot(aes(x = usertype, y = speed, colour = usertype)) +
geom_boxplot(outlier.colour = 'red') +
scale_colour_manual(name = 'User Type',
values = setNames(c('purple', 'orange'),
c('Subscriber', 'Customer'))) +
labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")
top_height <- max(total_data$start.station.latitude) - min(total_data$start.station.latitude)
top_width <- max(total_data$start.station.longitude) - min(total_data$start.station.longitude)
top_borders <- c(bottom = min(total_data$start.station.latitude) - 0.1 * top_height,
top = max(total_data$start.station.latitude) + 0.1 * top_height,
left = min(total_data$start.station.longitude) - 0.2 * top_width,
right = max(total_data$start.station.longitude) + 0.2 * top_width)
start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")
start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..,), size = 2, bins = 10, data = total_data, geom = "polygon", na.rm=TRUE,
) + labs( fill = "Density", title = "Start Location Density") + guides(alpha = F)
This graph shows that most bike trips in 2019 start in the center of NYC, with relatively few in the boroughs by comparison.
# convert dates to weekdays
total_data$day_of_week = weekdays(total_data$DATE)
start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) +
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) +
theme(legend.position = "right")
From these charts, we can see that the Saturday and Sunday location densities are slightly more spread than the weekdays, suggesting that the weekend trips are less concentrated in the inner city, albeit still focused in Manhattan.
start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) +
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by User Type") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ usertype) +
theme(legend.position = "right")
This graph shows that there is larger focus on downtown start locations for Subscribers, whereas Customers are spread out along Manhattan and are present in the boroughs as well.
## break down by one standard deviation above and below average of trip duration
ggmap(start) +
geom_point(data = total_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
col = tripduration)) +
scale_color_gradient(low = "yellow", high = "red")
## before noon and after noon
end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")
end_map + stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)
end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) +
guides(alpha = F) + labs(fill = "Density", title = "End Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) +
theme(legend.position = "right")
We see the same pattern breakdown in the end location density as we do the start locations, suggesting that weekends have a more spread out end locations than the weekdays.
total_rides = count(total_data)
test = total_data
test$start.station.name = as.character(test$start.station.name)
test$end.station.name = as.character(test$end.station.name)
test <- test[test$start.station.name==test$end.station.name, ]
same_station = count(test)
symmetric = same_station / total_rides
asymmetric = 1 - symmetric
Only {r} symmetric% of rides start and end at the same station, which means that {r} asymmetric% of rides are asymmetric traffic.
start_popularity = sort(table(total_data$start.station.name), decreasing=TRUE, na.rm=TRUE)
top10 = round(length(unique(total_data$start.station.name, na.rm=TRUE))*0.1)
top_10 = head(start_popularity, top10)
barplot(top_10)
top_starts = as.data.frame(top_10)
top_10rides = sum(top_starts$Freq)
top10_rides = top_10rides / total_rides
{r}top10_rides of bike rides start come from the top 10% most used station (which are:{r} top_10)
count_starts = as.data.frame(table(total_data$start.station.name), na.rm=TRUE)
names(count_starts) = c("station", "starts")
count_ends = as.data.frame(table(total_data$end.station.name), na.rm=TRUE)
names(count_ends) = c("station", "ends")
station_flow = as.data.frame(merge(count_starts, count_ends, by.x="station", by.y="station", all.x=TRUE, na.rm=TRUE))
station_flow$net = station_flow$starts / station_flow$ends
station_flow = na.omit(station_flow)
station_flow %>% mutate(station = fct_reorder(station, net)) %>% ggplot(aes(x=station, y=net)) + geom_bar(stat = "identity")+ geom_hline(yintercept=1, linetype="dashed", color = "red") + labs(x="Stations", y="Total Starts / Total Ends in 2019", title = "Net 2019 Flow of Bikes per Station") + theme(axis.text.x = element_blank())
The chart above depicts each station’s inflow/outflow of bikes in 2019. Those with a value greater than 1 show that they have a higher rate of bikes starting at their station than ending at their station. These stations would be important to target when thinking about rebalancing bikes, as they overall have more bikes leaving them then coming to them. Similarly, those with the lowest start/end ratios have more bikes ending at their station than leaving, making them prime candidates for moving their surplus to a station in more need.
surplus_stations = station_flow[station_flow$net < 0.75,]
deficit_stations = station_flow[station_flow$net > 1.25,]
surplus_stations
## station starts ends net
## 22 12 Ave & W 125 St 2 3 0.6666667
## 42 23 Ave & 27 St 16 22 0.7272727
## 72 31 Ave & Crescent St 21 32 0.6562500
## 79 31 St & Northern Blvd 19 27 0.7037037
## 81 34 Ave & 13 St 5 9 0.5555556
## 84 34 St & 35 Ave 16 27 0.5925926
## 101 44 Dr & Jackson Ave 62 89 0.6966292
## 118 5 St & 51 Ave 19 26 0.7307692
## 120 5 St & Market St 11 22 0.5000000
## 142 Adam Clayton Powell Blvd & W 118 St 26 40 0.6500000
## 144 Adam Clayton Powell Blvd & W 126 St 27 38 0.7105263
## 151 Amsterdam Ave & W 125 St 35 47 0.7446809
## 169 Battery Pl & Greenwich St 5 9 0.5555556
## 174 Bedford Ave & Montgomery St 6 9 0.6666667
## 215 Bushwick Ave & Forrest St 1 2 0.5000000
## 216 Bushwick Ave & Harman St 1 3 0.3333333
## 234 Carroll St & Bond St 59 81 0.7283951
## 236 Carroll St & Franklin Ave 11 16 0.6875000
## 272 Clinton St & Centre St 9 15 0.6000000
## 282 Columbia St & W 9 St 18 25 0.7200000
## 286 Commerce St & Van Brunt St 16 22 0.7272727
## 305 DeKalb Ave & Hudson Ave 122 164 0.7439024
## 307 DeKalb Ave & Skillman St 3 6 0.5000000
## 311 Ditmars Blvd & 19 St 20 36 0.5555556
## 313 Division Ave & Hooper St 2 4 0.5000000
## 316 Division St & Bowery (old) 5 8 0.6250000
## 325 Dwight St & Van Dyke St 10 15 0.6666667
## 330 E 103 St & Lexington Ave 42 60 0.7000000
## 344 E 118 St & 1 Ave 30 46 0.6521739
## 346 E 118 St & Madison Ave 20 40 0.5000000
## 375 E 35 St & 3 Ave 8 12 0.6666667
## 418 E 71 St & 2 Ave 2 4 0.5000000
## 448 E 98 St & Park Ave 29 40 0.7250000
## 487 Garfield Pl & 8 Ave 46 62 0.7419355
## 510 Halsey St & Broadway 1 2 0.5000000
## 515 Harrison Pl & Porter Ave 1 3 0.3333333
## 517 Hart St & Wyckoff Ave 10 16 0.6250000
## 548 Kingsland Ave & Nassau Ave 8 18 0.4444444
## 562 Lafayette St & Jersey St S 23 33 0.6969697
## 568 Lenox Ave & W 115 St 32 66 0.4848485
## 602 Marcus Garvey Blvd & Macon St 26 43 0.6046512
## 628 Morningside Dr & Amsterdam Ave 18 25 0.7200000
## 662 Pearl St & Anchorage Pl 32 45 0.7111111
## 691 Railroad Ave & Kay Ave 4 6 0.6666667
## 693 Richards St & Delavan St 11 18 0.6111111
## 730 Stagg St & Morgan Ave 9 17 0.5294118
## 741 Stewart Ave & Johnson Ave 2 3 0.6666667
## 749 Suydam St & St. Nicholas Ave 2 3 0.6666667
## 750 Throop Ave & Myrtle Ave 14 23 0.6086957
## 758 Union St & Bedford Ave 13 22 0.5909091
## 785 W 129 St & Convent Ave 13 22 0.5909091
## 830 W 47 St & 6 Ave 2 6 0.3333333
## 886 Wilson Ave & Troutman St 0 4 0.0000000
## 887 Withers St & Kingsland Ave 3 5 0.6000000
## 889 Wyckoff Av & Jefferson St 27 41 0.6585366
deficit_stations
## station starts ends net
## 6 1 Ave & E 5 St 4 1 4.000000
## 13 10 Hudson Yards 31 18 1.722222
## 20 11 St & 35 Ave 11 4 2.750000
## 29 2 Ave & 9 St 10 7 1.428571
## 37 21 St & 36 Ave 9 7 1.285714
## 38 21 St & 38 Ave 7 3 2.333333
## 43 24 Ave & 26 St 39 31 1.258065
## 46 27 Ave & 3 St 4 1 4.000000
## 48 27 Ave & 9 St 20 11 1.818182
## 50 28 Ave & 35 St 37 24 1.541667
## 52 28 St & 36 Ave 27 21 1.285714
## 62 3 Ave & E 95 St 35 25 1.400000
## 68 30 Ave & 21 St 21 16 1.312500
## 70 31 Ave & 30 St 34 25 1.360000
## 82 34 Ave & 21 St 22 13 1.692308
## 85 35 Ave & 10 St 19 13 1.461538
## 92 37 St & 24 Ave 20 11 1.818182
## 95 4 Ave & 2 St 9 4 2.250000
## 97 40 Ave & 9 St 20 14 1.428571
## 98 40 Ave & Crescent St 15 4 3.750000
## 100 44 Dr & 21 St 67 41 1.634146
## 104 47 Ave & 31 St 21 16 1.312500
## 110 5 Ave & E 103 St 140 106 1.320755
## 125 6 Ave & Spring St 34 15 2.266667
## 141 Adam Clayton Powell Blvd & W 115 St 28 16 1.750000
## 158 Atlantic Ave & Furman St 160 120 1.333333
## 162 Avenue D & E 8 St 130 84 1.547619
## 176 Bedford Ave & S 9 St 44 34 1.294118
## 177 Bergen St & Flatbush Ave 90 65 1.384615
## 179 Bergen St & Vanderbilt Ave 87 64 1.359375
## 193 Broadway & Hancock St 2 1 2.000000
## 197 Broadway & Roebling St 117 88 1.329545
## 218 Bushwick Ave & McKibbin St 16 9 1.777778
## 219 Bushwick Ave & Powers St 43 31 1.387097
## 220 Bushwick Ave & Stagg St 12 4 3.000000
## 221 Butler St & Court St 60 46 1.304348
## 225 Calyer St & Jewel St 3 1 3.000000
## 241 Cedar St & Myrtle Ave 36 28 1.285714
## 242 Center Blvd & 48 Ave 51 37 1.378378
## 245 Central Ave & Flushing Ave 23 7 3.285714
## 246 Central Ave & Starr Street 31 24 1.291667
## 248 Central Park North & Adam Clayton Powell Blvd 183 140 1.307143
## 265 Clermont Ave & Park Ave 49 37 1.324324
## 268 Cliff St & Fulton St (Old) 9 5 1.800000
## 270 Clinton Ave & Myrtle Ave 130 96 1.354167
## 290 Court St & Nelson St 44 35 1.257143
## 317 Dock 72 Way & Market St 20 15 1.333333
## 318 Douglass St & 3 Ave 77 61 1.262295
## 323 Duane St & Greenwich St 126 90 1.400000
## 340 E 114 St & 1 Ave 38 29 1.310345
## 350 E 123 St & Lexington Ave 37 29 1.275862
## 389 E 5 St & Avenue C 140 111 1.261261
## 395 E 53 St & Lexington Ave 24 18 1.333333
## 402 E 58 St & 1 Ave (NW Corner) 106 84 1.261905
## 431 E 82 St & East End Ave 78 61 1.278689
## 438 E 88 St & Park Ave 52 41 1.268293
## 445 E 95 St & 3 Ave 14 11 1.272727
## 447 E 97 St & Madison Ave 94 75 1.253333
## 475 Frost St & Meeker Ave 30 22 1.363636
## 481 Fulton St & Irving Pl 18 14 1.285714
## 482 Fulton St & Rockwell Pl 70 54 1.296296
## 489 Gold St & Frankfort St 6 2 3.000000
## 495 Grand Army Plaza & Plaza St West 135 94 1.436170
## 511 Halsey St & Tompkins Ave 47 31 1.516129
## 537 Jackson Ave & 46 Rd 31 24 1.291667
## 541 Jefferson St & Cypress Ave 2 1 2.000000
## 551 Knickerbocker Ave & George St 19 10 1.900000
## 554 Knickerbocker Ave & Thames St 15 10 1.500000
## 557 Lafayette Ave & Classon Ave 88 69 1.275362
## 559 Lafayette Ave & St James Pl 99 72 1.375000
## 566 Lefferts Pl & Franklin Ave 50 39 1.282051
## 595 Madison Ave & E 120 St 29 21 1.380952
## 601 Manhattan Av & Leonard St 2 1 2.000000
## 603 Marcy Ave & Lafayette Ave 34 18 1.888889
## 607 McKibbin St & Manhattan Ave 38 24 1.583333
## 608 Melrose St & Broadway 3 1 3.000000
## 609 Menahan St & Central Ave 3 1 3.000000
## 621 Monroe St & Tompkins Ave 49 37 1.324324
## 623 Montgomery St & Franklin Ave 12 8 1.500000
## 637 N 11 St & Kent Ave 57 43 1.325581
## 644 Newton Rd & 44 St 20 15 1.333333
## 661 Park Pl & Vanderbilt Ave 84 66 1.272727
## 666 Perry St & Greenwich Ave 3 2 1.500000
## 680 Powers St & Olive St 24 19 1.263158
## 682 Prospect Park West & 8 St 67 53 1.264151
## 686 Putnam Ave & Knickerbocker Ave 3 2 1.500000
## 702 Rivington St & Ridge St 176 140 1.257143
## 704 Rogers Ave & Sterling St 28 22 1.272727
## 707 S 4 St & Wythe Ave 205 163 1.257669
## 728 St Nicholas Ave & Manhattan Ave 96 76 1.263158
## 731 Stagg St & Union Ave 76 60 1.266667
## 748 Suydam St & Knickerbocker Ave 24 19 1.263158
## 776 W 106 St & Central Park West 117 88 1.329545
## 780 W 113 St & Broadway 64 51 1.254902
## 782 W 116 St & Broadway 59 47 1.255319
## 783 W 12 St & W 4 St 59 41 1.439024
## 813 W 37 St & 10 Ave 250 198 1.262626
## 819 W 42 St & 6 Ave 2 0 Inf
## 853 W 88 St & West End Ave 54 36 1.500000
## 861 Warren St & Court St 54 36 1.500000
## 869 Waterbury St & Stagg St 14 11 1.272727
## 879 Willoughby Ave & Hall St 107 73 1.465753
## 880 Willoughby Ave & Myrtle Ave 8 3 2.666667
## 888 Wolcott St & Dwight St 18 10 1.800000
Stations with surplus:
{r} surplus_stations
Stations with deficit: {r} deficit_stations